From 209883dcfc7fd135d5cf1c5e0eb26fc90cac5fe5 Mon Sep 17 00:00:00 2001 From: Yves-Alexis Perez Date: Tue, 23 Jan 2018 08:23:44 +0100 Subject: [PATCH] Revert "sched/rt: Simplify the IPI based RT balancing logic" This reverts commit 1c37ff78298a6b6063649123356a312e1cce12ca which is commit 4bdced5c9a2922521e325896a7bbbf0132c94e56 upstream. This commit removes two fields from struct rt_rq which is used in struct sched_rt_entity, used in turn in struct task_struct. This turns into a large ABI change for an enhancement fix, so revert that for now. Gbp-Pq: Topic debian Gbp-Pq: Name revert-sched-rt-Simplify-the-IPI-based-RT-balancing-.patch --- kernel/sched/core.c | 6 -- kernel/sched/rt.c | 235 ++++++++++++++++++++++--------------------- kernel/sched/sched.h | 24 ++--- 3 files changed, 127 insertions(+), 138 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e5066955cc3a..d748d4e5455d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5877,12 +5877,6 @@ static int init_rootdomain(struct root_domain *rd) if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) goto free_dlo_mask; -#ifdef HAVE_RT_PUSH_IPI - rd->rto_cpu = -1; - raw_spin_lock_init(&rd->rto_lock); - init_irq_work(&rd->rto_push_work, rto_push_irq_work_func); -#endif - init_dl_bw(&rd->dl_bw); if (cpudl_init(&rd->cpudl) != 0) goto free_dlo_mask; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 7a360d6f6798..34b1133cbac3 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -72,6 +72,10 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) raw_spin_unlock(&rt_b->rt_runtime_lock); } +#if defined(CONFIG_SMP) && defined(HAVE_RT_PUSH_IPI) +static void push_irq_work_func(struct irq_work *work); +#endif + void init_rt_rq(struct rt_rq *rt_rq) { struct rt_prio_array *array; @@ -91,6 +95,13 @@ void init_rt_rq(struct rt_rq *rt_rq) rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; plist_head_init(&rt_rq->pushable_tasks); + +#ifdef HAVE_RT_PUSH_IPI + rt_rq->push_flags = 0; + rt_rq->push_cpu = nr_cpu_ids; + raw_spin_lock_init(&rt_rq->push_lock); + init_irq_work(&rt_rq->push_work, push_irq_work_func); +#endif #endif /* CONFIG_SMP */ /* We start is dequeued state, because no RT tasks are queued */ rt_rq->rt_queued = 0; @@ -1853,166 +1864,160 @@ static void push_rt_tasks(struct rq *rq) } #ifdef HAVE_RT_PUSH_IPI - /* - * When a high priority task schedules out from a CPU and a lower priority - * task is scheduled in, a check is made to see if there's any RT tasks - * on other CPUs that are waiting to run because a higher priority RT task - * is currently running on its CPU. In this case, the CPU with multiple RT - * tasks queued on it (overloaded) needs to be notified that a CPU has opened - * up that may be able to run one of its non-running queued RT tasks. - * - * All CPUs with overloaded RT tasks need to be notified as there is currently - * no way to know which of these CPUs have the highest priority task waiting - * to run. Instead of trying to take a spinlock on each of these CPUs, - * which has shown to cause large latency when done on machines with many - * CPUs, sending an IPI to the CPUs to have them push off the overloaded - * RT tasks waiting to run. - * - * Just sending an IPI to each of the CPUs is also an issue, as on large - * count CPU machines, this can cause an IPI storm on a CPU, especially - * if its the only CPU with multiple RT tasks queued, and a large number - * of CPUs scheduling a lower priority task at the same time. 
- * - * Each root domain has its own irq work function that can iterate over - * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT - * tassk must be checked if there's one or many CPUs that are lowering - * their priority, there's a single irq work iterator that will try to - * push off RT tasks that are waiting to run. - * - * When a CPU schedules a lower priority task, it will kick off the - * irq work iterator that will jump to each CPU with overloaded RT tasks. - * As it only takes the first CPU that schedules a lower priority task - * to start the process, the rto_start variable is incremented and if - * the atomic result is one, then that CPU will try to take the rto_lock. - * This prevents high contention on the lock as the process handles all - * CPUs scheduling lower priority tasks. - * - * All CPUs that are scheduling a lower priority task will increment the - * rt_loop_next variable. This will make sure that the irq work iterator - * checks all RT overloaded CPUs whenever a CPU schedules a new lower - * priority task, even if the iterator is in the middle of a scan. Incrementing - * the rt_loop_next will cause the iterator to perform another scan. + * The search for the next cpu always starts at rq->cpu and ends + * when we reach rq->cpu again. It will never return rq->cpu. + * This returns the next cpu to check, or nr_cpu_ids if the loop + * is complete. * + * rq->rt.push_cpu holds the last cpu returned by this function, + * or if this is the first instance, it must hold rq->cpu. */ static int rto_next_cpu(struct rq *rq) { - struct root_domain *rd = rq->rd; - int next; + int prev_cpu = rq->rt.push_cpu; int cpu; + cpu = cpumask_next(prev_cpu, rq->rd->rto_mask); + /* - * When starting the IPI RT pushing, the rto_cpu is set to -1, - * rt_next_cpu() will simply return the first CPU found in - * the rto_mask. - * - * If rto_next_cpu() is called with rto_cpu is a valid cpu, it - * will return the next CPU found in the rto_mask. - * - * If there are no more CPUs left in the rto_mask, then a check is made - * against rto_loop and rto_loop_next. rto_loop is only updated with - * the rto_lock held, but any CPU may increment the rto_loop_next - * without any locking. + * If the previous cpu is less than the rq's CPU, then it already + * passed the end of the mask, and has started from the beginning. + * We end if the next CPU is greater or equal to rq's CPU. */ - for (;;) { - - /* When rto_cpu is -1 this acts like cpumask_first() */ - cpu = cpumask_next(rd->rto_cpu, rd->rto_mask); - - rd->rto_cpu = cpu; - - if (cpu < nr_cpu_ids) - return cpu; - - rd->rto_cpu = -1; + if (prev_cpu < rq->cpu) { + if (cpu >= rq->cpu) + return nr_cpu_ids; + } else if (cpu >= nr_cpu_ids) { /* - * ACQUIRE ensures we see the @rto_mask changes - * made prior to the @next value observed. - * - * Matches WMB in rt_set_overload(). + * We passed the end of the mask, start at the beginning. + * If the result is greater or equal to the rq's CPU, then + * the loop is finished. 
*/ - next = atomic_read_acquire(&rd->rto_loop_next); - - if (rd->rto_loop == next) - break; - - rd->rto_loop = next; + cpu = cpumask_first(rq->rd->rto_mask); + if (cpu >= rq->cpu) + return nr_cpu_ids; } + rq->rt.push_cpu = cpu; - return -1; + /* Return cpu to let the caller know if the loop is finished or not */ + return cpu; } -static inline bool rto_start_trylock(atomic_t *v) +static int find_next_push_cpu(struct rq *rq) { - return !atomic_cmpxchg_acquire(v, 0, 1); -} + struct rq *next_rq; + int cpu; -static inline void rto_start_unlock(atomic_t *v) -{ - atomic_set_release(v, 0); + while (1) { + cpu = rto_next_cpu(rq); + if (cpu >= nr_cpu_ids) + break; + next_rq = cpu_rq(cpu); + + /* Make sure the next rq can push to this rq */ + if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr) + break; + } + + return cpu; } +#define RT_PUSH_IPI_EXECUTING 1 +#define RT_PUSH_IPI_RESTART 2 + static void tell_cpu_to_push(struct rq *rq) { - int cpu = -1; - - /* Keep the loop going if the IPI is currently active */ - atomic_inc(&rq->rd->rto_loop_next); - - /* Only one CPU can initiate a loop at a time */ - if (!rto_start_trylock(&rq->rd->rto_loop_start)) - return; + int cpu; - raw_spin_lock(&rq->rd->rto_lock); + if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { + raw_spin_lock(&rq->rt.push_lock); + /* Make sure it's still executing */ + if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { + /* + * Tell the IPI to restart the loop as things have + * changed since it started. + */ + rq->rt.push_flags |= RT_PUSH_IPI_RESTART; + raw_spin_unlock(&rq->rt.push_lock); + return; + } + raw_spin_unlock(&rq->rt.push_lock); + } - /* - * The rto_cpu is updated under the lock, if it has a valid cpu - * then the IPI is still running and will continue due to the - * update to loop_next, and nothing needs to be done here. - * Otherwise it is finishing up and an ipi needs to be sent. - */ - if (rq->rd->rto_cpu < 0) - cpu = rto_next_cpu(rq); + /* When here, there's no IPI going around */ - raw_spin_unlock(&rq->rd->rto_lock); + rq->rt.push_cpu = rq->cpu; + cpu = find_next_push_cpu(rq); + if (cpu >= nr_cpu_ids) + return; - rto_start_unlock(&rq->rd->rto_loop_start); + rq->rt.push_flags = RT_PUSH_IPI_EXECUTING; - if (cpu >= 0) - irq_work_queue_on(&rq->rd->rto_push_work, cpu); + irq_work_queue_on(&rq->rt.push_work, cpu); } /* Called from hardirq context */ -void rto_push_irq_work_func(struct irq_work *work) +static void try_to_push_tasks(void *arg) { - struct rq *rq; + struct rt_rq *rt_rq = arg; + struct rq *rq, *src_rq; + int this_cpu; int cpu; - rq = this_rq(); + this_cpu = rt_rq->push_cpu; - /* - * We do not need to grab the lock to check for has_pushable_tasks. - * When it gets updated, a check is made if a push is possible. - */ + /* Paranoid check */ + BUG_ON(this_cpu != smp_processor_id()); + + rq = cpu_rq(this_cpu); + src_rq = rq_of_rt_rq(rt_rq); + +again: if (has_pushable_tasks(rq)) { raw_spin_lock(&rq->lock); - push_rt_tasks(rq); + push_rt_task(rq); raw_spin_unlock(&rq->lock); } - raw_spin_lock(&rq->rd->rto_lock); - /* Pass the IPI to the next rt overloaded queue */ - cpu = rto_next_cpu(rq); + raw_spin_lock(&rt_rq->push_lock); + /* + * If the source queue changed since the IPI went out, + * we need to restart the search from that CPU again. 
+ */ + if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) { + rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART; + rt_rq->push_cpu = src_rq->cpu; + } - raw_spin_unlock(&rq->rd->rto_lock); + cpu = find_next_push_cpu(src_rq); - if (cpu < 0) + if (cpu >= nr_cpu_ids) + rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING; + raw_spin_unlock(&rt_rq->push_lock); + + if (cpu >= nr_cpu_ids) return; + /* + * It is possible that a restart caused this CPU to be + * chosen again. Don't bother with an IPI, just see if we + * have more to push. + */ + if (unlikely(cpu == rq->cpu)) + goto again; + /* Try the next RT overloaded CPU */ - irq_work_queue_on(&rq->rd->rto_push_work, cpu); + irq_work_queue_on(&rt_rq->push_work, cpu); +} + +static void push_irq_work_func(struct irq_work *work) +{ + struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work); + + try_to_push_tasks(rt_rq); } #endif /* HAVE_RT_PUSH_IPI */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index cff985feb6e7..ad77d666583c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -463,7 +463,7 @@ static inline int rt_bandwidth_enabled(void) } /* RT IPI pull logic requires IRQ_WORK */ -#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_SMP) +#ifdef CONFIG_IRQ_WORK # define HAVE_RT_PUSH_IPI #endif @@ -485,6 +485,12 @@ struct rt_rq { unsigned long rt_nr_total; int overloaded; struct plist_head pushable_tasks; +#ifdef HAVE_RT_PUSH_IPI + int push_flags; + int push_cpu; + struct irq_work push_work; + raw_spinlock_t push_lock; +#endif #endif /* CONFIG_SMP */ int rt_queued; @@ -566,19 +572,6 @@ struct root_domain { struct dl_bw dl_bw; struct cpudl cpudl; -#ifdef HAVE_RT_PUSH_IPI - /* - * For IPI pull requests, loop across the rto_mask. - */ - struct irq_work rto_push_work; - raw_spinlock_t rto_lock; - /* These are only updated and read within rto_lock */ - int rto_loop; - int rto_cpu; - /* These atomics are updated outside of a lock */ - atomic_t rto_loop_next; - atomic_t rto_loop_start; -#endif /* * The "RT overload" flag: it gets set if a CPU has more than * one runnable RT task. @@ -591,9 +584,6 @@ struct root_domain { extern struct root_domain def_root_domain; -#ifdef HAVE_RT_PUSH_IPI -extern void rto_push_irq_work_func(struct irq_work *work); -#endif #endif /* CONFIG_SMP */ /* -- 2.30.2